This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
The following R libraries are used in the analysis:
library(statsr)
library(dplyr)
library(igraph)
library(readxl)
library(ggplot2)
library(ggVennDiagram)
library(ggpubr)
All data are captured in an Excel file. The following R code loads the data from the “Articles” tab of the Excel file for analysis:
# Read the article records from the "Articles" worksheet of the workbook
article_list <- read_excel(path = "Data.xlsx", sheet = "Articles")
The following table shows the mean, standard deviation, median, quartiles, and interquartile range of the number of articles published per year:
# Number of articles per publication year (N) and the running total (cum_N)
article_count_by_year <- article_list %>%
  group_by(Year) %>%
  summarise(N = n()) %>%
  mutate(cum_N = cumsum(N)) %>%
  select(Year, N, cum_N)

# Descriptive statistics of the yearly article counts
article_count_by_year %>%
  summarise(
    mean(N),
    sd(N),
    median(N),
    quantile(N, 0.25),
    quantile(N, 0.75),
    IQR(N)
  )
## # A tibble: 1 × 6
## `mean(N)` `sd(N)` `median(N)` `quantile(N, 0.25)` `quantile(N, 0.75)` `IQR(N)`
## <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 3.92 2.80 3 2 5 3
Fig 2. A scatter plot of the cumulative number of articles published over the years and a bar chart of the annual article count:
# Bars: yearly article count, drawn at twice its value so it is readable on
# the cumulative-total axis; the secondary axis halves the scale back to the
# real count. Points and dashed linear fit: cumulative total.
ggplot(article_count_by_year, aes(x = Year)) +
  geom_col(aes(y = N * 2), fill = "#5599EE", colour = "#006000") +
  scale_x_continuous(breaks = seq(1998, 2022, by = 1)) +
  theme(axis.text.x = element_text(face = "bold", angle = 90)) +
  scale_y_continuous(
    name = "Cumulative Total",
    breaks = seq(0, 110, by = 10),
    sec.axis = sec_axis(~ . / 2, name = "Article Count")
  ) +
  geom_point(aes(y = cum_N), size = 2) +
  geom_smooth(aes(y = cum_N), method = lm, se = FALSE, linetype = "dashed")
## `geom_smooth()` using formula = 'y ~ x'
# geom_line(aes(x=Year, y=cum_N/8), method=lm, se=FALSE)
The following table shows the number of articles published, grouped into five-year bins from 1998 to 2022:
# Add two new columns, year_bin and Period.
# year_bin counts five-year blocks back from the most recent publication year
# (0 = latest five years, 1 = the five years before that, ...).
# Period is the "start-end" label of that block. It is derived from the bin
# rather than hard-coded, so the labels stay correct when the newest year in
# the data is not 2022 or the data spans more than five bins. Rows with a
# missing Year keep an NA Period and no longer turn every other bin into NA.
article_list <- article_list %>%
  mutate(
    year_bin = (max(Year, na.rm = TRUE) - Year) %/% 5,
    Period = case_when(
      is.na(year_bin) ~ NA_character_,
      TRUE ~ paste0(
        max(Year, na.rm = TRUE) - 5 * year_bin - 4,
        "-",
        max(Year, na.rm = TRUE) - 5 * year_bin
      )
    )
  )
#article_list

# Article count per period and its share of all articles
ar_bin <- article_list %>%
  group_by(Period) %>%
  summarise(N = n()) %>%
  mutate(perc_bin = formattable::percent(N / sum(N)))
ar_bin
## # A tibble: 5 × 3
## Period N perc_bin
## <chr> <int> <formttbl>
## 1 1998-2002 15 15.31%
## 2 2003-2007 21 21.43%
## 3 2008-2012 18 18.37%
## 4 2013-2017 26 26.53%
## 5 2018-2022 18 18.37%
# Number of articles per country, listed from most to fewest
df_c <- article_list %>%
  group_by(Country) %>%
  summarise(N = n())

df_c %>%
  arrange(desc(N))
## # A tibble: 15 × 2
## Country N
## <chr> <int>
## 1 USA 29
## 2 Japan 22
## 3 South Korea 15
## 4 Egypt 11
## 5 Hungary 7
## 6 Australia 2
## 7 Canada 2
## 8 Slovakia 2
## 9 UK 2
## 10 China 1
## 11 Italy 1
## 12 Pakistan 1
## 13 Philippines 1
## 14 Spain 1
## 15 Vietnam 1
# Lump countries with at most two articles into "Others", then recompute the
# count and the share of all articles for each remaining group
df_c <- df_c %>%
  mutate(
    C_group = case_when(
      N > 2 ~ Country,
      TRUE ~ "Others"
    )
  ) %>%
  group_by(C_group) %>%
  summarise(N_group = sum(N)) %>%
  mutate(perc = formattable::percent(N_group / sum(N_group))) %>%
  rename(Country = C_group) %>%
  arrange(desc(N_group))

df_c
## # A tibble: 6 × 3
## Country N_group perc
## <chr> <int> <formttbl>
## 1 USA 29 29.59%
## 2 Japan 22 22.45%
## 3 South Korea 15 15.31%
## 4 Others 14 14.29%
## 5 Egypt 11 11.22%
## 6 Hungary 7 7.14%
# Fix the display order of the countries, with "Others" placed last
df_c <- df_c %>%
  mutate(
    Country = factor(
      Country,
      levels = c("USA", "Japan", "South Korea", "Egypt", "Hungary", "Others")
    )
  )
# Blank theme shared by the pie charts: minimal theme with axis titles,
# ticks, grid lines and the panel border removed, and a bold 14pt title
blank_theme <- theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 1.5, size = 14, face = "bold")
  )
# Pie chart of the article share per country: a single stacked bar wrapped
# onto polar coordinates, labelled with the country name and its percentage
plot_c <- ggplot(df_c, aes(x = "", y = perc, fill = Country)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  blank_theme +
  theme(axis.text.x = element_blank()) +
  geom_text(
    aes(x = 1.55, label = Country),
    position = position_stack(vjust = 0.4)
  ) +
  geom_text(
    aes(x = 1.4, label = perc),
    position = position_stack(vjust = 0.5)
  ) +
  ggtitle("Articles Breakdown by Country")

plot_c
# Number and share of articles per publication language, largest first
df_l <- article_list %>%
  group_by(Language) %>%
  summarise(N = n()) %>%
  mutate(perc = formattable::percent(N / sum(N))) %>%
  arrange(desc(N))

df_l
## # A tibble: 3 × 3
## Language N perc
## <chr> <int> <formttbl>
## 1 English 81 82.65%
## 2 Japanese 11 11.22%
## 3 Korean 6 6.12%
# Pie chart of the article share per language; each slice is labelled with
# the language and with "count ( percentage )", so the legend is hidden
plot_l <- ggplot(df_l, aes(x = "", y = perc, fill = Language)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  blank_theme +
  theme(axis.text.x = element_blank()) +
  geom_text(
    aes(x = 1.6, label = Language),
    position = position_stack(vjust = 0.5)
  ) +
  geom_text(
    aes(x = 1.4, label = paste(N, "(", perc, ")")),
    position = position_stack(vjust = 0.55)
  ) +
  ggtitle("Articles Breakdown by Language") +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  )

plot_l
# Number of articles per year and language.
# Language becomes a factor with a fixed level order so the languages are
# stacked consistently in the chart.
article_list <- article_list %>%
  mutate(
    Language = factor(Language, levels = c("Korean", "Japanese", "English"))
  )

df_yl <- article_list %>%
  group_by(Year, Language) %>%
  summarise(N = n())
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# Stacked bar chart of yearly article counts split by language, with the
# count printed inside each segment
plot_yl <- ggplot(df_yl, aes(x = Year, y = N, fill = Language)) +
  geom_col() +
  geom_text(
    aes(label = N),
    size = 3,
    hjust = 0.5,
    vjust = 1.5,
    position = "stack"
  ) +
  scale_x_continuous(breaks = seq(1998, 2022, by = 1)) +
  scale_y_continuous(breaks = seq(0, 12, by = 2)) +
  theme(axis.text.x = element_text(face = "bold", angle = 90)) +
  ggtitle("Number of Article Published Over the Years Breakdown By Language")

plot_yl
#### Publication Types
# Number and share of articles per article type, largest first
df_at <- article_list %>%
  group_by(ArticleType) %>%
  summarise(N = n()) %>%
  mutate(perc = formattable::percent(N / sum(N))) %>%
  arrange(desc(N))

df_at
## # A tibble: 6 × 3
## ArticleType N perc
## <chr> <int> <formttbl>
## 1 Full paper 85 86.73%
## 2 Abstract 8 8.16%
## 3 Book Chapter 2 2.04%
## 4 Short Communication 1 1.02%
## 5 Study Protocol 1 1.02%
## 6 Thesis 1 1.02%
# Pie chart of the article share per article type, labelled with the
# percentage of each slice
plot_at <- ggplot(df_at, aes(x = "", y = perc, fill = ArticleType)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  blank_theme +
  theme(axis.text.x = element_blank()) +
  geom_text(
    aes(x = 1.6, label = perc),
    position = position_stack(vjust = 0.5)
  ) +
  #geom_text(aes(x=1.4, label = perc), position = position_stack(vjust=0.5)) +
  ggtitle("Articles Breakdown by Article Type")

plot_at
# Venn diagram of the preclinical study designs: collect the article UIDs
# flagged "Yes" for in-vitro (cell), animal and chemical-analysis studies and
# show how the three sets overlap
I_list <- article_list %>%
  filter(InVitroStudy == "Yes") %>%
  select(UID)
A_list <- article_list %>%
  filter(AnimalStudy == "Yes") %>%
  select(UID)
O_list <- article_list %>%
  filter(ChemicalAnalysis == "Yes") %>%
  select(UID)

venn_list <- list(
  Animal = A_list$UID,
  Chemical = O_list$UID,
  Cell = I_list$UID
)
ggVennDiagram(venn_list, set_color = c("red", "green", "blue"))
# Count the articles for each preclinical study design.
# Rows without a preclinical design are dropped from the table, but the
# percentage is taken over all articles (nrow(article_list)).
article_list$PreclinicalDesign <- factor(
  article_list$PreclinicalDesign,
  levels = c(
    "Animal",
    "Animal+Cell",
    "Animal+Cell+Chemical",
    "Cell",
    "Cell+Chemical",
    "Chemical"
  )
)

df_ht <- article_list %>%
  filter(!is.na(PreclinicalDesign)) %>%
  group_by(PreclinicalDesign) %>%
  summarise(N = n()) %>%
  mutate(perc = formattable::percent(N / nrow(article_list)))

df_ht
## # A tibble: 6 × 3
## PreclinicalDesign N perc
## <fct> <int> <formttbl>
## 1 Animal 25 25.51%
## 2 Animal+Cell 6 6.12%
## 3 Animal+Cell+Chemical 3 3.06%
## 4 Cell 19 19.39%
## 5 Cell+Chemical 2 2.04%
## 6 Chemical 1 1.02%
# Count the articles for each clinical study design.
# Rows without a clinical design are dropped from the table, but the
# percentage is taken over all articles (nrow(article_list)).
article_list$ClinicalDesign <- factor(
  article_list$ClinicalDesign,
  levels = c(
    "Randomised controlled trial",
    "Non-randomised controlled trial",
    "Before and after study",
    "Descriptive cross-sectional studies",
    "Case series",
    "Case report"
  )
)

df_ht <- article_list %>%
  filter(!is.na(ClinicalDesign)) %>%
  group_by(ClinicalDesign) %>%
  summarise(N = n()) %>%
  mutate(perc = formattable::percent(N / nrow(article_list)))

df_ht
## # A tibble: 6 × 3
## ClinicalDesign N perc
## <fct> <int> <formttbl>
## 1 Randomised controlled trial 21 21.43%
## 2 Non-randomised controlled trial 1 1.02%
## 3 Before and after study 7 7.14%
## 4 Descriptive cross-sectional studies 1 1.02%
## 5 Case series 4 4.08%
## 6 Case report 8 8.16%
# Articles per period and study type (Total), joined with the period's
# overall article count (N from ar_bin) to give the share within the period
df_st1 <- article_list %>%
  group_by(Period, StudyType) %>%
  summarise(Total = n()) %>%
  left_join(select(ar_bin, Period, N), by = "Period") %>%
  mutate(Ratio = formattable::percent(Total / N))
## `summarise()` has grouped output by 'Period'. You can override using the
## `.groups` argument.
# Stacked bars of the article count per period, split by study type
plot_st1_n <- ggplot(df_st1, aes(x = Period, y = Total, fill = StudyType)) +
  geom_col() +
  geom_text(
    aes(label = Total),
    size = 3,
    hjust = 0.5,
    vjust = 1.5,
    position = "stack"
  ) +
  theme(axis.text.x = element_text(face = "bold", angle = 0)) +
  ggtitle("Number of Article Published Over 5 Year period Breakdown By Study Type")

# Same breakdown expressed as the share of each period's articles
plot_st1 <- ggplot(df_st1, aes(x = Period, y = Ratio, fill = StudyType)) +
  geom_col() +
  geom_text(
    aes(label = Ratio),
    size = 3,
    hjust = 0.5,
    vjust = 1.5,
    position = "stack"
  ) +
  theme(axis.text.x = element_text(face = "bold", angle = 0)) +
  ggtitle("Ratio of Article Published Over 5 Year period Breakdown By Study Type")

# Show the count chart (A) above the ratio chart (B)
ggarrange(
  plot_st1_n,
  plot_st1,
  labels = c("A", "B"),
  ncol = 1,
  nrow = 2
)
# Further break down the studies by design:
#   "Interventional" - randomised controlled trials, non-randomised
#                      controlled trials and before-and-after studies
#   "Observational"  - any other recorded clinical design
#   "Preclinical"    - no clinical design recorded
# The interventional designs are matched by name rather than by their factor
# position, so the result does not depend on the level order of
# ClinicalDesign. The "Preclinical" label was previously misspelled
# ("Preclincal"), and that spelling appeared in the plot legends.
article_list <- article_list %>%
  mutate(
    StudyDesign = case_when(
      ClinicalDesign %in% c(
        "Randomised controlled trial",
        "Non-randomised controlled trial",
        "Before and after study"
      ) ~ "Interventional",
      !is.na(ClinicalDesign) ~ "Observational",
      TRUE ~ "Preclinical"
    )
  )
# Articles per period and study design (Total), joined with the period's
# overall article count (N from ar_bin) to give the share within the period
df_st2 <- article_list %>%
  group_by(Period, StudyDesign) %>%
  summarise(Total = n()) %>%
  left_join(select(ar_bin, Period, N), by = "Period") %>%
  mutate(Ratio = formattable::percent(Total / N))
## `summarise()` has grouped output by 'Period'. You can override using the
## `.groups` argument.
# Stacked bars of the article count per period, split by study design
plot_st2_n <- ggplot(df_st2, aes(x = Period, y = Total, fill = StudyDesign)) +
  geom_col() +
  geom_text(
    aes(label = Total),
    size = 3,
    hjust = 0.5,
    vjust = 1.5,
    position = "stack"
  ) +
  theme(axis.text.x = element_text(face = "bold", angle = 0))
#+ ggtitle("Total Article Published Over 5 Year period Breakdown By Study Design")
#plot_st2_n

# Same breakdown expressed as the share of each period's articles
plot_st2 <- ggplot(df_st2, aes(x = Period, y = Ratio, fill = StudyDesign)) +
  geom_col() +
  geom_text(
    aes(label = Ratio),
    size = 3,
    hjust = 0.5,
    vjust = 1.5,
    position = "stack"
  ) +
  theme(axis.text.x = element_text(face = "bold", angle = 0))
Fig. 3 The number of articles published over time by study design: (A) the absolute count and (B) the relative percentage.
# Stack the count chart (A) above the ratio chart (B) in a single figure
mixed_plot <- ggarrange(
  plot_st2_n,
  plot_st2,
  labels = c("A", "B"),
  ncol = 1,
  nrow = 2
)

mixed_plot
# Number of articles per translational stage, alongside the overall total
# and each stage's share of it
article_count_by_tstage <- article_list %>%
  group_by(Tstage) %>%
  summarise(N = n()) %>%
  mutate(
    Total = sum(N),
    Ratio = formattable::percent(N / Total)
  ) %>%
  select(Tstage, N, Total, Ratio)

article_count_by_tstage
## # A tibble: 3 × 4
## Tstage N Total Ratio
## <dbl> <int> <int> <formttbl>
## 1 0 56 98 57.14%
## 2 1 38 98 38.78%
## 3 2 4 98 4.08%
# Number and share of articles per product source
df_s <- article_list %>%
  group_by(Source) %>%
  summarise(N = n()) %>%
  mutate(perc = formattable::percent(N / sum(N)))

df_s
## # A tibble: 3 × 3
## Source N perc
## <chr> <int> <formttbl>
## 1 Daiwa 85 86.73%
## 2 Erom 8 8.16%
## 3 STR Biotech 5 5.10%
# Pie chart of the article share per product source; each slice is labelled
# with the source and with "count ( percentage )", so the legend is hidden
plot_s <- ggplot(df_s, aes(x = "", y = perc, fill = Source)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  blank_theme +
  theme(axis.text.x = element_blank()) +
  geom_text(
    aes(x = 1.6, label = Source),
    position = position_stack(vjust = 0.5)
  ) +
  geom_text(
    aes(x = 1.4, label = paste(N, "(", perc, ")")),
    position = position_stack(vjust = 0.55)
  ) +
  ggtitle("Articles Breakdown by Product Source") +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9"))

plot_s
#### Funding analysis
# Read the "Funding" worksheet and flag each article as disclosing funding
# ("Yes") unless its funding Source is recorded as "Unknown" ("No")
funding_list <- read_excel(path = "Data.xlsx", sheet = "Funding") %>%
  mutate(Disclosure = ifelse(Source != "Unknown", "Yes", "No"))

# Number and share of articles with and without a funding disclosure
funding_list %>%
  group_by(Disclosure) %>%
  summarise(N = n()) %>%
  mutate(perc = formattable::percent(N / sum(N)))
## # A tibble: 2 × 3
## Disclosure N perc
## <chr> <int> <formttbl>
## 1 No 50 51.02%
## 2 Yes 48 48.98%
# Public vs private/commercial funding among articles that disclose funding.
# A non-missing Public value counts as "Yes".
# perc is the share of disclosing articles; gperc is the share of all articles.
funding_list %>%
  filter(Disclosure == "Yes") %>%
  mutate(Public = ifelse(!is.na(Public), "Yes", "No")) %>%
  group_by(Public) %>%
  summarise(N = n()) %>%
  mutate(
    perc = formattable::percent(N / sum(N)),
    gperc = formattable::percent(N / nrow(funding_list))
  )
## # A tibble: 2 × 4
## Public N perc gperc
## <chr> <int> <formttbl> <formttbl>
## 1 No 31 64.58% 31.63%
## 2 Yes 17 35.42% 17.35%
# Nonprofit funding among articles that disclose funding.
# A non-missing P_Nonprofit value counts as "Yes".
# perc is the share of disclosing articles; gperc is the share of all articles.
funding_list %>%
  filter(Disclosure == "Yes") %>%
  mutate(P_Nonprofit = ifelse(!is.na(P_Nonprofit), "Yes", "No")) %>%
  group_by(P_Nonprofit) %>%
  summarise(N = n()) %>%
  mutate(
    perc = formattable::percent(N / sum(N)),
    gperc = formattable::percent(N / nrow(funding_list))
  )
## # A tibble: 2 × 4
## P_Nonprofit N perc gperc
## <chr> <int> <formttbl> <formttbl>
## 1 No 42 87.50% 42.86%
## 2 Yes 6 12.50% 6.12%
# Commercial funders among articles that disclose funding.
# A missing Commercial value is shown as "Non-Commercial".
# perc is the share of disclosing articles; gperc is the share of all articles.
funding_list %>%
  filter(Disclosure == "Yes") %>%
  mutate(
    Commercial = ifelse(is.na(Commercial), "Non-Commercial", Commercial)
  ) %>%
  group_by(Commercial) %>%
  summarise(N = n()) %>%
  mutate(
    perc = formattable::percent(N / sum(N)),
    gperc = formattable::percent(N / nrow(funding_list))
  )
## # A tibble: 4 × 4
## Commercial N perc gperc
## <chr> <int> <formttbl> <formttbl>
## 1 BioMedica 1 2.08% 1.02%
## 2 Daiwa 26 54.17% 26.53%
## 3 Erom 5 10.42% 5.10%
## 4 Non-Commercial 16 33.33% 16.33%
# Product-only support among articles that disclose funding.
# A missing `Product Only` value is shown as "No".
# perc is the share of disclosing articles; gperc is the share of all articles.
funding_list %>%
  filter(Disclosure == "Yes") %>%
  mutate(
    `Product Only` = ifelse(is.na(`Product Only`), "No", `Product Only`)
  ) %>%
  group_by(`Product Only`) %>%
  summarise(N = n()) %>%
  mutate(
    perc = formattable::percent(N / sum(N)),
    gperc = formattable::percent(N / nrow(funding_list))
  )
## # A tibble: 2 × 4
## `Product Only` N perc gperc
## <chr> <int> <formttbl> <formttbl>
## 1 Daiwa 8 16.67% 8.16%
## 2 No 40 83.33% 40.82%
# Assign every funding record to a five-year period counted back from the
# most recent year in funding_list (year_bin 0 = latest five years).
# Period is the "start-end" label of that block. It is derived from the bin
# rather than hard-coded, so the labels stay correct when the newest year in
# the data is not 2022 or the data spans more than five bins. Rows with a
# missing Year keep an NA Period and no longer turn every other bin into NA.
# Disclosure is re-derived from Source in the same way as when the sheet is
# loaded.
funding_list <- funding_list %>%
  mutate(
    Disclosure = ifelse(Source != "Unknown", "Yes", "No"),
    year_bin = (max(Year, na.rm = TRUE) - Year) %/% 5,
    Period = case_when(
      is.na(year_bin) ~ NA_character_,
      TRUE ~ paste0(
        max(Year, na.rm = TRUE) - 5 * year_bin - 4,
        "-",
        max(Year, na.rm = TRUE) - 5 * year_bin
      )
    )
  )

# Articles with and without funding disclosure in each period.
# summarise() drops only the last grouping level (Disclosure), so the result
# is still grouped by Period and sum(N) below is the period total: perc is
# the share within each period.
df_yd <- funding_list %>%
  group_by(Period, Disclosure) %>%
  summarise(N = n()) %>%
  mutate(perc = formattable::percent(N / sum(N)))
## `summarise()` has grouped output by 'Period'. You can override using the
## `.groups` argument.
# Print the per-period counts and shares of articles by funding disclosure
df_yd
## # A tibble: 10 × 4
## # Groups: Period [5]
## Period Disclosure N perc
## <chr> <chr> <int> <formttbl>
## 1 1998-2002 No 13 86.67%
## 2 1998-2002 Yes 2 13.33%
## 3 2003-2007 No 11 52.38%
## 4 2003-2007 Yes 10 47.62%
## 5 2008-2012 No 11 61.11%
## 6 2008-2012 Yes 7 38.89%
## 7 2013-2017 No 10 38.46%
## 8 2013-2017 Yes 16 61.54%
## 9 2018-2022 No 5 27.78%
## 10 2018-2022 Yes 13 72.22%
# Stacked bar chart of the share of articles with and without funding
# disclosure in each period, with the percentage printed in each segment
plot_yd <- ggplot(df_yd, aes(x = Period, y = perc, fill = Disclosure)) +
  geom_col() +
  geom_text(
    aes(label = perc),
    size = 3,
    hjust = 0.5,
    vjust = 1.5,
    position = "stack"
  ) +
  #scale_x_continuous(breaks=seq(1998,2022,1)) +
  #scale_y_continuous(breaks=seq(0,12,2)) +
  theme(axis.text.x = element_text(face = "bold", angle = 0)) +
  ggtitle("Proportion of Article Published Over the Years With Funding Disclosure ")

plot_yd